********************************************************************************
* matching_worker_w_wage.do
* Purpose: ROBUSTNESS CHECK - Worker PSM adding within-firm wage rank as a
*          matching variable.
*
* Difference from baseline (matching_worker.do):
*   - Adds bin_t4earn (5 within-firm earnings quintile bins) to the exact-matching
*     cell definition: sector x province x sex x age x within-firm wage quintile.
*   - The pscore model also adds the square of t4earn (earnings level); as coded,
*     only squared terms of age and earnings enter (no linear terms).
*
* Rationale: Tests whether results are robust to matching workers on their
* pre-event wage rank within their firm (captures worker quality / position).
*
* Output: $data/worker_matched_w_wage.dta
*         $data/worker_matched_list_w_wage.dta
********************************************************************************
* First and last base (pre-event) year for which workers are matched.
* The event year of each cohort is the base year plus one.
global start_year 2005
global end_year   2016

// Year-by-year worker matching. Each pass loads one worker file, applies the
// eligibility screens, builds the exact-matching cells, runs 1:1 nearest-
// neighbour matching without replacement, and saves that year's matched list.
forvalues y = $start_year/$end_year {

	* Load the worker file for the current base year. The option that discards
	* the data in memory is clear; use does not accept replace.
	use $data/worker_`y', clear

	* The base year is stored as year_prior; the event year is year_prior + 1
	gen year_prior	=	year
	
	//------------------------------------------------------------------
	// Restrict to workers at matched firms
	// joinby links each worker to the firm-pair structure established by
	// matching_firm.do. After this, treated workers = at M&A firms;
	// control workers = at matched control firms.
	//------------------------------------------------------------------
	joinby entid_syn year_prior using $data/firm_matched_list
	* pscore_og is not used below (presumably the firm-level score - confirm)
	drop pscore_og
	
	
	//------------------------------------------------------------------
	// Earnings restriction: real total earnings >= ~CAD 3,900
	//------------------------------------------------------------------
	gegen 	total_wage 		= sum(t4earn), by(casenum2019)
	gen		real_total_wage = total_wage/CPI_base_2011
	drop if real_total_wage < 3900 | mi(real_total_wage)
	
	//------------------------------------------------------------------
	// Worker eligibility restrictions
	//------------------------------------------------------------------
	* Drop moonlighters (workers with simultaneous jobs in any year)
	drop if moonlighter == 1

	* Drop workers with gaps in their employment history (max_gap > 1)
	drop 	if max_gap > 1 | mi(max_gap)

	* Drop workers who leave their current firm before t = 0
	* (i.e., not present at the treated firm in the event year)
	* NOTE(review): a missing last_year_at_firm compares as larger than any
	* number, so such rows pass this screen; the same holds for a missing
	* tenure in the screen below. Confirm neither variable can be missing.
	gen 	present_at_treated = (`y' + 1 <= last_year_at_firm)
	drop 	if present_at_treated == 0

	* Require at least 4 years of tenure (continuous attachment to the firm)
	drop 	if tenure < 4

	* Drop workers with missing demographic or sector information
	drop if mi(t1_age_recorded) | mi(t1_sex_recorded) | mi(naics2) | mi(OPAddressProvince)
	
	//------------------------------------------------------------------
	// Age bins: 5-year groups starting from age 17
	// (17-21 -> bin 0, 22-26 -> bin 1, etc.)
	//------------------------------------------------------------------
	gen bin_age = int(( t1_age_recorded - 17)/5)
	
	** Within-firm earnings quintile: KEY ADDITION relative to baseline.
	* Assigns each worker to one of 5 earnings rank bins within their firm.
	gquantiles 	bin_t4earn = t4earn, xtile n(5) by(entid_syn)
	
	* Exact-matching cells: sector x province x sex x every bin_ variable.
	* Workers can only be matched within the same cell.
	* NOTE(review): the bin_* wildcard picks up every variable whose name
	* starts with bin_, not only bin_age and bin_t4earn; confirm that no other
	* bin_ variable arrives from the worker file or from firm_matched_list.
	gegen cell = group(naics2 OPAddressProvince t1_sex_recorded bin_*)
	drop if cell == .
	
	* Estimate the pscore (linear probability model) on the eligible sample.
	* NOTE(review): the single # operator enters only the squared terms of age
	* and earnings; a full quadratic would use ## - confirm this is intended.
	reg treated c.t1_age_recorded#c.t1_age_recorded c.t4earn#c.t4earn
	
	* Store the fitted value as a double. With the default float type, adding
	* 10*cell below rounds away small within-cell score differences once the
	* cell index is large, turning distinct scores into ties.
	predict double pscore
	
	* Shift scores by 10 per cell so that, with caliper(1), a match can only be
	* found among observations whose shifted scores lie within 1 of each other
	replace pscore = pscore + 10.0*cell
	
	* psmatch2 requires an outcome variable; a random placeholder is supplied
	* because only the matched pairs are used afterwards
	gen outcome = rnormal()
	
	*** Fix the sort order before matching so that reruns see the same order
	gsort casenum2019
	
	* 1:1 nearest-neighbour matching without replacement
	psmatch2 treated, outcome(outcome) pscore(pscore) caliper(1) n(1) noreplacement
	sum outcome

	* Common pair key: a matched control carries its own _id, a matched treated
	* worker carries _n1 (the _id of its matched control)
	gen 	id = _id if treated==0 & _weight==1
	replace id = _n1 if treated==1 & _weight==1
	
	* Replace the existing pairid with the worker-level pair number
	* (numbered within year; unique only together with year_prior)
	drop pairid
	gegen pairid = group(id)
	gsort pairid
	

	* Unmatched workers have no pair key
	drop if mi(id)
	keep casenum2019 entid_syn treated pairid year_prior bin_t4earn
	
	save $data/worker_matched_list_w_wage_`y',  replace 
	
}

**# matched worker list
* Stack the year-specific matched lists into one file covering all base years.
drop _all
local list_vars casenum2019 entid_syn treated pairid year_prior bin_t4earn
forvalues yr = $start_year/$end_year {
	display "`yr'"
	append using $data/worker_matched_list_w_wage_`yr', keep(`list_vars') force
}

compress
save $data/worker_matched_list_w_wage, replace

**# matched worker panel
* For each calendar year, keep the records of matched workers and reduce them
* to one record per worker x matching year x pair.
forvalues yr = 2001/2017 {

	use $data/worker_`yr', clear

	* Park the current-year employer under a temporary name so that it does not
	* clash with entid_syn from the matched list (the firm used for matching)
	rename entid_syn temp_id

	* Keep matched treated and control workers only
	joinby casenum2019 using $data/worker_matched_list_w_wage, unmatched(none)

	* This merge adds no variables and drops no rows: keep(1 3) retains every
	* master row and keepusing() names only the key
	merge m:1 entid_syn using $data/first_mna, keep(1 3) keepusing(entid_syn) nogen

	* Flag records at the matching firm, then restore the current-year employer
	* as entid_syn
	generate same_firm = (temp_id == entid_syn)
	drop entid_syn
	rename temp_id entid_syn

	* Earnings summed over all of the worker's records in this year
	gegen total_wage = sum(t4earn), by(casenum2019 pairid year_prior)

	* Keep one record per worker x matching year x pair: the matching-firm
	* record first, then the highest-paying one, ties broken by firm ID
	gsort casenum2019 year_prior -same_firm -t4earn entid_syn
	duplicates drop casenum2019 year_prior pairid, force

	drop same_firm

	compress
	save $data/worker_matched_`yr', replace
}

* Stack the yearly panels into one dataset, deleting each per-year file as soon
* as it has been appended.
drop _all
forvalues yr = 2001/2017 {
	local yearly_file $data/worker_matched_`yr'
	append using `yearly_file', force
	erase `yearly_file'.dta
}

save $data/worker_matched_w_wage_intermid, replace

//------------------------------------------------------------------------------
// STEP 5: CONSTRUCT THE EVENT-STUDY PANEL (worker_matched_w_wage.dta)
//
// Adds all variables needed for worker event-study regressions:
//   - Numeric IDs (worker, firm, pair)
//   - Event time variable t and ds_k dummies
//   - Sector codes (NAICS2)
//   - M&A deal characteristics (acquirer flag, deal type)
//   - Job mobility variables (moved, moved_sec, moved_ind)
//   - Pair-level variables (matched_acq, matched_emoved)
//   - Present-at-firm indicator (for stayer analysis)
//------------------------------------------------------------------------------
* Order rows by pair and matching year, treated worker first
gsort pairid year_prior -treated year

* Numeric IDs
gegen worker_id	= group(casenum2019)
gegen firm_id 	= group(entid_syn)
gegen id 		= group(worker_id year_prior)   // unique worker x matching-year ID

//------------------------------------------------------------------
// Event time variable and indicator dummies (same structure as matching_firm.do)
//------------------------------------------------------------------
* t = 0 in the event year (year_prior + 1); the tails are pooled into -6 and 6
gen 	t = year - (year_prior + 1)
replace t = 6 	if t >  5 	& ~mi(t)
replace t = -6 	if t < -5 	& ~mi(t)

* tab creates one dummy per observed level of t, in ascending order; levelsof
* lists the same levels in the same order. Labelling each dummy with its actual
* level and locating the base period by value (rather than by a fixed position)
* keeps labels and base period correct even when some event time is absent.
tab t, gen(ds_)
levelsof t, local(ts)
local i = 0
foreach lev of local ts {
	local ++i
	label variable ds_`i' "`lev'"
	if `lev' == -1 {
		replace ds_`i' = 0   // t = -1 is the omitted base period
	}
}

//------------------------------------------------------------------
// Sector: rebuild the 2-digit NAICS code from naics, pooling related codes
//------------------------------------------------------------------
destring naics, replace
drop naics2
gen 	naics2 = int(naics/100)
replace naics2 = 31 if inlist(naics2, 32, 33)
replace naics2 = 44 if naics2 == 45
replace naics2 = 48 if naics2 == 49
replace naics2 = 54 if inlist(naics2, 56, 61, 62)

* Sector and firm recorded in the matching year (year == year_prior).
* naics2_event is filled for treated workers only.
gen naics2_event 	= naics2  if treated == 1 & year == year_prior
gen naics_event		= naics   if year == year_prior
gen firm_event 		= firm_id if year == year_prior

* Spread the matching-year sector and firm over all rows of the same id, then
* overwrite naics with the matching-year value on rows at that firm up to the
* event year.
* NOTE(review): naics2 was computed above and is not refreshed after this
* overwrite - confirm that is intended.
gegen naics_tmp 	= firstnm(naics_event), by(id)
gegen firmid_tmp 	= firstnm(firm_event),  by(id)
replace naics 		= naics_tmp if firm_id == firmid_tmp & year <= year_prior + 1

** merge in mna characteristics
merge	m:1 entid_syn 	using 	$data/first_mna, keep(1 3) keepusing(DEAL Acquirer merger) nogen

* For treated workers, keep the deal variables on the t = -1 row only
foreach dealvar of varlist DEAL Acquirer {
	replace `dealvar' = . if t != -1 & treated == 1
}

//------------------------------------------------------------------
// Job mobility variables (computed from the matched worker panel)
//------------------------------------------------------------------
gsort id year

* moved == 1 if the worker changed firm or had a gap of >= 2 years.
* The first row of each id has no previous row (year_diff is missing) and is
* therefore never counted as a move.
by id: 	gen year_diff	= year - year[_n-1]
by id: 	gen moved 		= (firm_id ~= firm_id[_n-1] | year_diff >= 2) & ~mi(year_diff)

* Summary mobility measures over the post-event window (t > 0), constant within id
gegen	total_moves		= total(moved * (t > 0) ), 			by(id)   // total post-event moves
gegen	ever_moved_post	= max(moved * (t > 0)), 			by(id)   // any post-event move

* Year of the first post-event move. The condition goes inside cond() rather
* than in an if qualifier: with an if qualifier the result is filled only on the
* rows that satisfy it and is missing on every other row of the same id.
* Ids with no post-event move get a missing value.
gegen	first_move_year	= min(cond(moved == 1 & t > 0, year, .)), by(id)
gen		t_moved			= first_move_year - (year_prior + 1)           // event time of first move

**# pair-level variables
* Within each pair x matching year, rows are ordered treated worker first, then
* by year.
gsort pairid year_prior -treated year

* Each entry is "new variable  source variable": the new variable holds the
* first non-missing value of the source within the pair x matching year.
foreach spec in "matched_acq Acquirer" "matched_emoved ever_moved_post" "matched_wage_5 bin_t4earn" {
	tokenize `spec'
	gegen `1' = firstnm(`2'), by(pairid year_prior)
}

compress
save $data/worker_matched_w_wage, replace